Bag of Words demo

This is a demo for class SURV 622/SURVMETH 622. It contains two sections: Bag of words model and Word2Vec model for representation learning.

# Load packages
# install.packages(c('tm', 'ggplot2', 'word2vec', 'udpipe', 'uwot', 'glmnet', 'plotly'))
library(tm)
## Loading required package: NLP
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(word2vec)
library(uwot)
## Loading required package: Matrix
library(glmnet)
## Loaded glmnet 4.1-3
# Cosine similarity between two numeric vectors:
# dot(a, b) / (||a|| * ||b||). Result lies in [-1, 1].
# Also works on 1 x d matrices (as returned by predict(..., type = "embedding")),
# since sum() ignores dimensions.
cos.sim <- function(a, b) {
  dot_product <- sum(a * b)
  norm_product <- sqrt(sum(a^2) * sum(b^2))
  dot_product / norm_product
}

Read the data

Data are downloaded from Kaggle (https://www.kaggle.com/datasets/kazanova/sentiment140). The tweets have been annotated (0 = negative, 4 = positive) and they can be used to detect sentiment.

tweets = read.csv("training.100000.processed.noemoticon.csv",stringsAsFactors=FALSE, encoding = "UTF-8")

Check the text

The data has been preprocessed. Let’s take a look.

# Preview the first six rows: label (target), tweet metadata, and the raw text.
head(tweets)
##   target        ids                         date     flag           user
## 1      4 2053333385 Sat Jun 06 04:10:15 PDT 2009 NO_QUERY        meggz15
## 2      0 1964093465 Fri May 29 13:42:51 PDT 2009 NO_QUERY  BattleBabeeyx
## 3      4 1980616407 Sun May 31 07:13:44 PDT 2009 NO_QUERY mrswilliams815
## 4      4 2174460574 Sun Jun 14 22:35:57 PDT 2009 NO_QUERY     abbiereyes
## 5      0 1754448486 Sun May 10 06:18:29 PDT 2009 NO_QUERY         cradow
## 6      4 2174736346 Sun Jun 14 23:11:51 PDT 2009 NO_QUERY       aurattii
##                                                                                              text
## 1                                                                    is spendingg time with ness 
## 2                                                 @buckhollywood aw it was so sad  shes too cute!
## 3                                                                        Back to Barstow today!! 
## 4                                            @thedailysurvey vote ko po touch my hand.  thanks po
## 5                                                            needs Twilight! I feel so addicted. 
## 6 @mungob yay nice to hear that u had a good day 4 a change cudn't happened 2 a more deservin MG

Check missing data

# Verify the label, id, and text columns contain no missing values.
sum(is.na(tweets$target))
## [1] 0
sum(is.na(tweets$ids))
## [1] 0
sum(is.na(tweets$text))
## [1] 0

This line is to address some encoding error.

tweets$text = iconv(tweets$text,"WINDOWS-1252","UTF-8") 

Create a document-term matrix

Preprocessing roadmap: 1. lowercase all characters 2. remove numbers 3. remove punctuation 4. remove stop words

# Build a tm corpus and normalize the text in order:
# lowercase -> strip numbers -> strip punctuation -> drop stopwords -> collapse whitespace.
# The "transformation drops documents" warnings below are emitted by tm for
# SimpleCorpus transformations; NOTE(review): they appear expected here, since
# the corpus still holds all 100,000 documents afterwards — confirm.
tweet_corpus = Corpus(VectorSource(tweets$text))
tweet_corpus = tm_map(tweet_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(tweet_corpus, content_transformer(tolower)):
## transformation drops documents
tweet_corpus = tm_map(tweet_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(tweet_corpus, removeNumbers): transformation
## drops documents
tweet_corpus = tm_map(tweet_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(tweet_corpus, removePunctuation): transformation
## drops documents
# "the" and "and" are already in stopwords("english"); listing them again is
# redundant but harmless.
tweet_corpus = tm_map(tweet_corpus, removeWords, c("the", "and", stopwords("english")))
## Warning in tm_map.SimpleCorpus(tweet_corpus, removeWords, c("the", "and", :
## transformation drops documents
tweet_corpus =  tm_map(tweet_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(tweet_corpus, stripWhitespace): transformation
## drops documents

Convert word vector to document-term matrix

# Bag-of-words representation: one row per tweet, one column per term,
# cells hold raw term frequencies (tf weighting, per the printout below).
tweet_dtm = DocumentTermMatrix(tweet_corpus)
tweet_dtm
## <<DocumentTermMatrix (documents: 100000, terms: 106647)>>
## Non-/sparse entries: 729556/10663970444
## Sparsity           : 100%
## Maximal term length: 141
## Weighting          : term frequency (tf)

Take a quick look at first 15 documents and terms.

# Peek at the top-left 15 x 15 corner of the document-term matrix.
inspect(tweet_dtm[1:15, 1:15])
## <<DocumentTermMatrix (documents: 15, terms: 15)>>
## Non-/sparse entries: 19/206
## Sparsity           : 92%
## Maximal term length: 14
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs back barstow buckhollywood cute ness sad shes spendingg time today
##   1     0       0             0    0    1   0    0         1    1     0
##   15    0       0             0    0    0   0    0         0    1     1
##   2     0       0             1    1    0   1    1         0    0     0
##   3     1       1             0    0    0   0    0         0    0     1
##   4     0       0             0    0    0   0    0         0    0     0
##   5     0       0             0    0    0   0    0         0    0     0
##   6     0       0             0    0    0   0    0         0    0     0
##   7     0       0             0    0    0   0    0         0    0     0
##   8     1       0             0    0    0   0    0         0    0     1
##   9     0       0             0    0    0   0    0         0    0     0

Drop most of the rare terms to make the matrix less sparse.

# Drop terms with sparsity above 0.99, i.e. keep only terms appearing in at
# least ~1% of documents. Vocabulary shrinks from 106,647 to 88 terms.
tweet_dtm = removeSparseTerms(tweet_dtm, 0.99)
tweet_dtm
## <<DocumentTermMatrix (documents: 100000, terms: 88)>>
## Non-/sparse entries: 194642/8605358
## Sparsity           : 98%
## Maximal term length: 8
## Weighting          : term frequency (tf)

Implement Latent Semantic Analysis (LSA)

tweet_svd = svd(tweet_dtm)

Review the latent topic for the first 15 documents.

heatmap(tweet_svd$u[1:15,1:15])

Word2Vec

Starting from scratch

We will build a Word2Vec by ourselves using CBOW architecture in the context of tweets we have seen.

Train the model

# Train a CBOW word2vec model on the raw tweet text: 15-dimensional embeddings,
# 10 training iterations. Seed set for reproducibility.
set.seed(1015)
model = word2vec(x = tweets$text, type = "cbow", dim = 15, iter = 10)

Visualize the word relations using UMAP

# Extract the vocabulary x 15 embedding matrix and project it to 2-D with UMAP
# for visualization.
embedding = as.matrix(model)
viz = umap(embedding, n_neighbors = 15, n_threads = 2)
# NOTE(review): the "//" gsub patterns come from the word2vec vignette, where
# rownames look like "token//POS". With plain tokens here, word and xpos both
# equal the rowname — confirm whether xpos is actually needed.
df  = data.frame(word = gsub("//.+", "", rownames(embedding)), 
                  xpos = gsub(".+//", "", rownames(embedding)), 
                  x = viz[, 1], y = viz[, 2], 
                  stringsAsFactors = FALSE)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive scatter of an arbitrary slice of 201 words, labeled by token.
plot_ly(df[500:700,], x = ~x, y = ~y, type = "scatter", mode = 'text', text = ~word)

Using pretrained Word2Vec model

model = read.word2vec("18/model.bin", normalize = TRUE)

Select any word and its neighbors (in terms of semantic similarity)

# Top-5 nearest neighbors (cosine similarity) for two example words.
predict(model, newdata = c("fries", "money"), type = "nearest", top_n = 5)
## $fries
##   term1      term2 similarity rank
## 1 fries     french  0.9076371    1
## 2 fries    burgers  0.8938329    2
## 3 fries hamburgers  0.8827033    3
## 4 fries sandwiches  0.8591300    4
## 5 fries      fried  0.8587483    5
## 
## $money
##   term1  term2 similarity rank
## 1 money monies  0.8541762    1
## 2 money  funds  0.8506691    2
## 3 money   cash  0.8144652    3
## 4 money   sums  0.8034646    4
## 5 money   fund  0.7885280    5

Arithmetic of word vector

# Classic analogy: king - man + woman ~ queen.
wv = predict(model, newdata = c("king", "man", "woman"), type = "embedding")
wv = wv["king", ] - wv["man", ] + wv["woman", ]
# Nearest neighbors of the composed vector; "king" itself still ranks first,
# with "queen" second.
predict(model, newdata = wv, type = "nearest", top_n = 3)
##      term similarity rank
## 1    king  0.9670773    1
## 2   queen  0.9044872    2
## 3 monarch  0.8917280    3

Implicit stereotype

Let’s check whether the word “nurse” is more similar to “man” or to “woman”.

# Compare cosine similarity of "nurse" to "woman" vs "man" to illustrate an
# implicit gender stereotype encoded in the embeddings.
wv_woman = predict(model, newdata = c("woman"), type = "embedding")
wv_man = predict(model, newdata = c("man"), type = "embedding")
wv_nurse = predict(model, newdata = c("nurse"), type = "embedding")
# Each wv_* is a 1 x dim matrix; cos.sim sums over all entries, so this works.
cos.sim(wv_woman, wv_nurse)
## [1] 0.5568123
cos.sim(wv_man, wv_nurse)
## [1] 0.3725856

Sentiment Analysis

# Recode positive labels from 4 to 1 so targets are 0/1 for logistic regression.
tweets[tweets$target==4,'target'] = 1
# Densify the (now 88-term) DTM and split: first 80,000 rows for training,
# remaining 20,000 for testing.
X = as.matrix(tweet_dtm)
X_train = X[1:80000,]
X_test = X[80001:100000,]
y_train = tweets$target[1:80000]
y_test = tweets$target[80001:100000]

Using LASSO to fit the prediction model of sentiment

Use cross-validation to select the best hyperparameter (lambda)

# Cross-validate (glmnet's default 10 folds) over the lambda path for an
# L1-penalized (alpha = 1) logistic model; keep the lambda minimizing CV error.
cv_model = cv.glmnet(X_train, y_train, alpha = 1,family='binomial')
best_lambda <- cv_model$lambda.min

Fit the final model using the best lambda value

# Refit the LASSO at the CV-selected lambda.
# BUG FIX: family = 'binomial' must match the cv.glmnet call above; without it
# glmnet defaults to family = 'gaussian' and silently fits a linear model to
# the 0/1 labels instead of logistic regression.
best_model = glmnet(X_train, y_train, alpha = 1, lambda = best_lambda, family = 'binomial')
coef(best_model)
## 89 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## (Intercept)  0.509615405
## time         0.020775331
## sad         -0.367571290
## back        -0.041753708
## today       -0.049851972
## thanks       0.301470798
## feel        -0.172824045
## day          0.003479450
## good         0.159719444
## nice         0.208868687
## much        -0.056079161
## never       -0.129614258
## thank        0.314303077
## dont        -0.120005905
## bad         -0.259986504
## love         0.207308429
## now         -0.029410892
## see          0.064532395
## can          0.066355604
## make         0.022973075
## think       -0.011464786
## tomorrow    -0.028943130
## amp          0.030020502
## fun          0.130560344
## get         -0.031541247
## way          0.014475582
## want        -0.146190158
## one         -0.009686349
## haha         0.141985881
## really      -0.072731958
## thats        0.033111237
## come        -0.003404260
## didnt       -0.170698351
## great        0.205186837
## last        -0.128196002
## night        0.046721516
## happy        0.244948277
## just         0.012958240
## little       .          
## though      -0.107214118
## week        -0.030377983
## sorry       -0.280943483
## watching     0.150672116
## lol          0.093223352
## hope         0.011095269
## know         0.015207428
## need        -0.110259071
## working     -0.119563625
## wait         0.216694965
## even        -0.042661806
## people       .          
## new          0.112382486
## next        -0.009084835
## weekend     -0.006243249
## miss        -0.320268474
## tonight      0.020843349
## will         0.062331529
## still       -0.152095628
## cant        -0.209224245
## yes          0.139856060
## work        -0.156535604
## got         -0.018286598
## well         0.032721052
## days         .          
## long        -0.081132273
## gonna       -0.022229080
## first        0.098387792
## ive         -0.052330492
## youre        0.168850403
## wish        -0.305522451
## bed         -0.020719139
## sleep       -0.089245488
## best         0.150142444
## right        .          
## better       0.062316862
## twitter      0.065944365
## hey          0.181833899
## home        -0.077322460
## hate        -0.309012170
## like         0.001146549
## going        0.001167402
## morning      0.054679125
## awesome      0.221887130
## take         .          
## ill          0.068789499
## getting     -0.017765084
## yeah         0.055115221
## soon         0.016151073
## school      -0.047642548

Plot the coef of Lasso model

# Plot the fitted LASSO coefficients, sorted by effect size.
# Renamed the data frame from `coef` to `coef_df`: the original name shadowed
# stats::coef(), which is also called on the line below.
coef_df = as.data.frame(as.matrix(coef(best_model)))
coef_df = cbind(coef_df, term = row.names(coef_df))
coef_df = coef_df[order(coef_df$s0), ]
# reorder() sorts terms by coefficient on the (flipped) axis.
ggplot(coef_df,
       aes(x = reorder(term, s0), y = s0)) +
        geom_point() + 
        coord_flip() +
  theme_bw()

Test Model Performance

Using classification accuracy as the test metric (the code below computes the share of correct predictions, not RMSE)

# Predict on the held-out set. type = "response" returns fitted probabilities
# for a binomial fit (and fitted values for a gaussian fit), so thresholding
# at 0.5 is valid either way; the default type = "link" would return log-odds
# for a binomial model, making the 0.5 cutoff wrong.
y_predicted <- predict(best_model, s = best_lambda, newx = X_test, type = "response")
y_predicted <- ifelse(y_predicted > 0.5, 1, 0)
# Classification accuracy (share of correct predictions) on the 20,000 test
# tweets — note this is accuracy, not RMSE.
mean(y_test == y_predicted)
## [1] 0.64455